library(xlsx)
#install.packages('sjPlot')
library(sjPlot)
library(sandwich)
library(lmtest)
library(miceadds)
library(glue)

# set your working directory
setwd("...")

# read data from your working directory
df <- read.csv("regression_data_en.csv")

# replicate statistics from Table 1

# T-test for differences in imprisonment length for male and female offenders
t_st <- t.test(df[df$gender_offender=='Male offender', 'sentence_period_whole'], df[df$gender_offender=='Female offender', 'sentence_period_whole'])
print(glue("t-statistic {format(round(t_st$statistic, digits = 1), nsmall = 1)}, p-value {format(round(t_st$p.value), nsmall = 3)}"))
sent_all <- round(mean(df[, 'sentence_period_whole']), 1)
sent_f <- round(mean(df[df$gender_offender=='Female offender', 'sentence_period_whole']), 1)
sent_m <- round(mean(df[df$gender_offender=='Male offender', 'sentence_period_whole']), 1)
stats <- cbind(sent_all, sent_f, sent_m)
row.names(stats) <- "Length of imprisonment"

# Chi-square tests for differences in all categorical features for male and female offenders
cols = c("gender_victim", "gender_judge", "rel_type", "children", "recidivism_type", "guilty_plea", "soft_binary", "hard_binary")

# Table with proportions
for (col in cols) {
  chi <- chisq.test(df[, "gender_offender"], df[, col], correct = FALSE)
  print(glue("{col}: chi {format(round(chi$statistic, digits = 1), nsmall = 1)}, p-value {format(round(chi$p.value, digits=3), nsmall = 3)}"))
  prop_all <- round(table(df[, col]) / nrow(df) * 100)
  prop_f <- round(table(df[df$gender_offender=='Female offender', col]) / nrow(df[df$gender_offender=='Female offender', ]) * 100)
  prop_m <- round(table(df[df$gender_offender=='Male offender', col]) / nrow(df[df$gender_offender=='Male offender', ]) * 100)
  stats <- rbind(stats, t(rbind(prop_all, prop_f, prop_m)))
  }
colnames(stats) <- c("All", "Female offenders", "Male offenders")
print(stats)
print(c(nrow(df), nrow(df[df$gender_offender=='Female offender',]), nrow(df[df$gender_offender=='Male offender',])))

# Models 1, 2, 3
reg1 <- lm.cluster(sentence_period_whole ~ gender_offender + gender_victim + gender_judge + recidivism_type + guilty_plea + soft_binary + hard_binary + year + region,
                 cluster = 'court_name',
                 data = df)
reg2 <- lm.cluster(sentence_period_whole ~ gender_offender + gender_victim + gender_judge + rel_type + recidivism_type + guilty_plea + soft_binary + hard_binary + year + region,
                   cluster = 'court_name',
                   data = df)
reg3 <- lm.cluster(sentence_period_whole ~ gender_offender + gender_victim + gender_judge + rel_type + children + recidivism_type + guilty_plea + soft_binary + hard_binary + year + region,
                   cluster = 'court_name',
                   data = df)

# Collapse region names and years in an output table 
region_categories <- unique(df$region) 
region_categories <- paste(region_categories, collapse=",")
hide_region <- glue('region [{region_categories}]')
year_categories <- unique(df$year) 
year_categories <- paste(year_categories, collapse=",")
hide_year <- glue('year [{year_categories}]')

# Table 2
tab_model(reg1$lm_res, reg2$lm_res, reg3$lm_res, 
          show.se=TRUE, show.p=FALSE, show.ci = FALSE, string.pred = "Variables", 
          string.est = "B", string.se = "S.E.", p.style="numeric_stars", 
          dv.labels =c("(1)", "(2)", "(3)"), 
          rm.terms = c(hide_region, hide_year))

# Models 4, 5
reg4 <- lm.cluster(sentence_period_whole ~ gender_victim + gender_judge + rel_type + children + recidivism_type + guilty_plea + soft_binary + hard_binary + year + region,
                   cluster = 'court_name',
                   data = df[df$gender_offender=='Male offender',])

reg5 <- lm.cluster(sentence_period_whole ~ gender_victim + gender_judge + rel_type + children + recidivism_type + guilty_plea + soft_binary + hard_binary + year + region,
                   cluster = 'court_name',
                   data = df[df$gender_offender=='Female offender',])

# Table 3
tab_model(reg4$lm_res, reg5$lm_res, 
          show.se=TRUE, show.p=FALSE, show.ci = TRUE, 
          string.pred = "Variables", string.est = "B", string.se = "S.E.", 
          p.style="numeric_stars", dv.labels =c("Male offenders", "Female offenders"), 
          rm.terms = c(hide_region, hide_year))

# CIs are calculated separately as tab_model for some reason cannot process them properly
tidy_res <- tidy(reg4$lm_res, conf.int = TRUE)
head(tidy_res, n = 14)

tidy_res <- tidy(reg5$lm_res, conf.int = TRUE)
head(tidy_res, n = 14)
